About this page

This website has a Flexdashboard containing a few plots on NYC restaurant inspection data.

The NYC restaurant inspections dataset has 397584 observations and 18 variables. Some observations have no dba, which have been filtered out from later analysis. Some observations have a date of 1900-01-01, which is obviously in error. These were also filtered out from subsequent analysis showing critical violations by stores over the years.

rest_inspec |>
  filter(!(boro == "Missing") & !is.na(score)) |> 
  mutate(
    boro = factor(boro),
    boro = fct_relevel(boro, c("STATEN ISLAND", "BRONX", "QUEENS", "MANHATTAN", "BROOKLYN"))
  ) |> 
  plot_ly(x = ~boro, y = ~score, color = ~boro, type = "violin", colors = "viridis") |> 
  layout(
    title = "NYC DOHMH restaurant sanitary inspection scores by borough",
    yaxis = list(title = "Restaurant sanitation score"),
    xaxis = list(title = "Borough"),
    shapes = list(
      list(type = "line", y0 = 14, y1 = 14, x0 = -0.5, x1 = 4.5, line = list(width = 1, dash = "dash")),
      list(type = "line", y0 = 28, y1 = 28, x0 = -0.5, x1 = 4.5, line = list(width = 1, dash = "dash"))
    )  
  ) |> 
  add_annotations(
    x = -0.5, y = 7, xref = "x", yref = "y", text = "A\n0-13", showarrow = F, font = list(size = 14)
  ) |> 
  add_annotations(
    x = -0.5, y = 21, xref = "x", yref = "y", text = "B\n14-27", showarrow = F, font = list(size = 14)
  ) |> 
  add_annotations(
    x = -0.5, y = 35, xref = "x", yref = "y", text = "C\n28+", showarrow = F, font = list(size = 14)
  )

This code chunk for the restaurants with the most critical reports takes the total number of critical reports by business and orders them in descending order to see the most frequently reported businesses. Dunkin’ Donuts and Dunkin’ Donuts/Baskin Robbins are their own separate entry, so these are combined to Dunkin’ Donuts. head is used to extract the top 20 worst offenders.

critical_restaurants_levels <- 
  rest_inspec |> 
  filter(critical_flag == "Critical" & year(inspection_date) != 1900) |> 
  mutate(
    dba = case_match(
      dba,
      "DUNKIN' DONUTS, BASKIN ROBBINS" ~ "DUNKIN' DONUTS",
      .default = dba
    )
  ) |> 
  group_by(dba) |> 
  summarize(critical_reports = n()) |> 
  arrange(desc(critical_reports)) |>  
  head(n = 100) |> 
  pull(dba) 
  
rest_inspec |> 
  filter(critical_flag == "Critical" & year(inspection_date) != 1900) |> 
  mutate(
    dba = case_match(
      dba,
      "DUNKIN' DONUTS, BASKIN ROBBINS" ~ "DUNKIN' DONUTS",
      .default = dba
    ), 
    year = factor(year(inspection_date))
  ) |> 
  group_by(dba, year) |> 
  summarize(critical_reports = n()) |> 
  arrange(desc(critical_reports), year) |>  
  head(n = 100) |>
  mutate(
    dba = factor(dba),
    dba = fct_relevel(dba, critical_restaurants_levels)
  ) |> 
  plot_ly(x = ~dba, y = ~critical_reports, type = "bar", color = ~year, colors = "viridis") |> 
  layout(
    title = "NYC DOHMH critical violations by restaurant",
    yaxis = list(title = "Critical violations"),
    xaxis = list(title = "Restaurants")
  )
## `summarise()` has grouped output by 'dba'. You can override using the `.groups`
## argument.
## Warning: There were 41 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `dba = fct_relevel(dba, critical_restaurants_levels)`.
## ℹ In group 1: `dba = "AU BON PAIN"`.
## Caused by warning:
## ! 59 unknown levels in `f`: RED MANGO, JUST SALAD, APPLEBEE'S, SHAKE SHACK, GINO'S PIZZA, MR. WONTON, FAMOUS
## FAMIGLIA PIZZERIA, JIMBO'S HAMBURGER PALACE, PIO PIO, GRAND SICHUAN, KUMO SUSHI, TOASTIES, EUROPA CAFE, PHO
## BANG RESTAURANT, SZECHUAN GOURMET, TWO BOOTS, EL VALLE RESTAURANT, FIKA, …, ROAST KITCHEN, and BOBBY VAN'S
## GRILL
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 40 remaining warnings.

This code chunk gets establishments that were closed (or re-closed) among those with critical reports. To circumvent an encoding issue in an unrelated variable that prevented filtering from running without errors, only the necessary variables are selected for before filtering.

cuisine_levels <-
  rest_inspec |> 
  mutate(
    cuisine_description = iconv(cuisine_description, from = 'UTF-8', to = 'ISO-8859-1')
  ) |> 
  select(boro, critical_flag, action, dba, cuisine_description, inspection_date) |> 
  filter(str_detect(action, "[Cc]losed")) |>  
  group_by(cuisine_description) |> 
  summarize(closed = n()) |> 
  arrange(desc(closed)) |>  
  pull(cuisine_description)

rest_inspec |> 
  mutate(
    cuisine_description = iconv(cuisine_description, from = 'UTF-8', to = 'ISO-8859-1')
  ) |> 
  select(boro, critical_flag, action, dba, cuisine_description, inspection_date) |> 
  filter(str_detect(action, "[Cc]losed")) |>  
  group_by(boro, cuisine_description) |> 
  summarize(closed = n()) |> 
  arrange(desc(closed)) |> 
  mutate(
    cuisine_description = factor(cuisine_description),
    cuisine_description = fct_relevel(cuisine_description, cuisine_levels),
    cuisine_description = fct_recode(cuisine_description, "Latin" = "Latin (Cuban, Dominican, Puerto Rican, South & Central American)", "Bottled beverages" = "Bottled beverages, including water, sodas, juices, etc.")
  ) |> 
  plot_ly(x = ~cuisine_description, y = ~closed, color = ~boro, type = "bar") |> 
  layout(
    title = "Cuisines with the most restaurants closed by the DOHMH",
    yaxis = list(title = "Number of restaurants closed"),
    xaxis = list(title = "Cuisines")
  )
## `summarise()` has grouped output by 'boro'. You can override using the
## `.groups` argument.
rest_inspec |> 
  select(cuisine_description, score, zipcode, dba) |> 
  filter(!is.na(cuisine_description) & !is.na(score) & cuisine_description != "Not Listed/Not Applicable" & zipcode %in% c(10010, 10016, 10001)) |> 
  mutate(
    cuisine_description = factor(cuisine_description),
    cuisine_description = fct_reorder(cuisine_description, score),
    cuisine_description = fct_recode(cuisine_description, "Latin" = "Latin (Cuban, Dominican, Puerto Rican, South & Central American)", "Bottled beverages" = "Bottled beverages, including water, sodas, juices, etc.", "Cafe/Coffee/Tea" = "Café/Coffee/Tea"),
    description = str_c("Business: ", dba, "\n Score: ", score, "\nZIP code: ", zipcode)
  ) |> 
  plot_ly(x = ~cuisine_description, y = ~score, color = ~cuisine_description, type = "box", text = ~description) |> 
  layout(
    title = "Scores of restaurants in ZIP codes near me",
    yaxis = list(title = "Restaurant sanitation scores"),
    xaxis = list(title = "Cuisines"),
    shapes = list(
      list(type = "line", y0 = 14, y1 = 14, x0 = 0, x1 = 60, line = list(width = 1, dash = "dash")),
      list(type = "line", y0 = 28, y1 = 28, x0 = 0, x1 = 60, line = list(width = 1, dash = "dash"))
    )
  ) |> 
  add_annotations(
    x = -0.5, y = 6, xref = "x", yref = "y", text = "A\n0-13", showarrow = F
  ) |> 
  add_annotations(
    x = -0.5, y = 21, xref = "x", yref = "y", text = "B\n14-27", showarrow = F
  ) |> 
  add_annotations(
    x = -0.5, y = 33, xref = "x", yref = "y", text = "C\n28+", showarrow = F
  )
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
mn_avg_score <- 
  rest_inspec |> 
  select(inspection_date, score, boro) |> 
  filter(!is.na(inspection_date) & year(inspection_date) != 1900 & boro == "MANHATTAN") |>
  group_by(inspection_date) |> 
  summarize(avg_score = mean(score, na.rm = TRUE)) 

bk_avg_score <- 
  rest_inspec |> 
  select(inspection_date, score, boro) |> 
  filter(!is.na(inspection_date) & year(inspection_date) != 1900 & boro == "BROOKLYN") |>
  group_by(inspection_date) |> 
  summarize(avg_score = mean(score, na.rm = TRUE)) 

bx_avg_score <-
  rest_inspec |> 
  select(inspection_date, score, boro) |> 
  filter(!is.na(inspection_date) & year(inspection_date) != 1900 & boro == "BRONX") |>
  group_by(inspection_date) |> 
  summarize(avg_score = mean(score, na.rm = TRUE)) 

qn_avg_score <-
  rest_inspec |> 
  select(inspection_date, score, boro) |> 
  filter(!is.na(inspection_date) & year(inspection_date) != 1900 & boro == "QUEENS") |>
  group_by(inspection_date) |> 
  summarize(avg_score = mean(score, na.rm = TRUE)) 

si_avg_score <- 
  rest_inspec |> 
  select(inspection_date, score, boro) |> 
  filter(!is.na(inspection_date) & year(inspection_date) != 1900 & boro == "STATEN ISLAND") |>
  group_by(inspection_date) |> 
  summarize(avg_score = mean(score, na.rm = TRUE)) 

plot_ly(si_avg_score, x = ~inspection_date, y = ~avg_score, name = "Staten Island", type = "scatter", mode = "lines", connectgaps = TRUE) |> 
  add_trace(data = bk_avg_score, x = ~inspection_date, y = ~avg_score, name = "Brooklyn", type = "scatter", mode = "lines") |> 
  add_trace(data = bx_avg_score, x = ~inspection_date, y = ~avg_score, name = "The Bronx", type = "scatter", mode = "lines") |> 
  add_trace(data = qn_avg_score, x = ~inspection_date, y = ~avg_score, name = "Brooklyn", type = "scatter", mode = "lines") |> 
  add_trace(data = mn_avg_score, x = ~inspection_date, y = ~avg_score, name = "Manhattan", type = "scatter", mode = "lines") |> layout(
    title = "Average NYC DOHMH restaurant sanitary inspection scores, 10/2011-10/2017",
    yaxis = list(title = "Average restaurant sanitation score"),
    xaxis = list(title = "Inspection Date")
  )